package com.kdtech.analyse.Bbs;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;
import com.kdtech.utils.StringUtils;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Page analyser for the Liuzhou News Network forum (home.lznews.gov.cn).
 *
 * <p>Recognises thread (detail) page URLs and extracts title, author, publish
 * date, body content and view/comment counters from the fetched HTML.
 */
public class LznewsBbsAnalyse implements AnalyseNews{

	/**
	 * URL shapes that identify a thread detail page. Compiled once; literal
	 * dots are escaped so that they only match a real '.' character.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
			Pattern.compile("http://home\\.lznews\\.gov\\.cn/thread-[0-9]*-1-1\\.html"),
			Pattern.compile("http://home\\.lznews\\.gov\\.cn/forum\\.php\\?from=portal&mod=viewthread&tid=[0-9]*"),
			Pattern.compile("http://home\\.lznews\\.gov\\.cn/forum\\.php\\?extra=.*&mod=viewthread&tid=[0-9]*")
	};

	/**
	 * Tells whether the given URL is a thread detail page of this forum.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the known detail
	 *         page patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url){
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts the thread data from an already fetched page.
	 *
	 * <p>The URL is not checked against {@link #isDetailPage(String)}; any page
	 * is parsed and fields that cannot be located stay empty or {@code null}.
	 *
	 * @param urlMeta the fetched page; its URL and HTML are read. It is expected
	 *                to be non-null and to carry HTML.
	 * @return a new {@link NewsMeta} populated with whatever could be extracted;
	 *         the page URL is stored both as URL and as update URL
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta bbs = new NewsMeta();
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		bbs.setUrl(url);

		Document doc = Jsoup.parse(html);

		// Title: thread heading first, alternative heading class as fallback.
		String title = doc.select("h1.ts").text();
		if (StringUtils.isBlank(title)) {
			title = doc.select("h1.ph").text();
		}

		String author = null;
		Long date = null;
		String content = null;

		// Layout 1: the first post inside the post list.
		Element firstPost = doc.select("div#postlist > div[id^=post_]").first();
		if (firstPost != null) {
			author = firstPost.select("div.authi a.xw1").text();
			date = DateUtils.matchDate(firstPost.select("div.authi span").attr("title"));
			content = HtmlCleaner.getContentHtml(url, firstPost.select("td.t_f"));
		}

		// Layout 2: first "div.bm" inside "div.mn" (presumably the portal-style view - confirm).
		// NOTE(review): when both layouts match, these values replace the ones
		// taken from the first post above; that precedence is kept as it was.
		Element mainBlock = doc.select("div.mn div.bm").first();
		if (mainBlock != null) {
			date = DateUtils.matchDate(mainBlock.select("p.xg1").text());
			content = HtmlCleaner.getContentHtml(url, mainBlock.select("td.t_f"));
			author = JSoupUtils.matchAuthor(doc, "发布者: ");
		}

		// Counters: "<views> | <comments>" in the side cell of the post table.
		String stats = doc.select("td.pls div.hm").text();
		Integer clickNum = NumberUtils.matchNumber(StringUtils.substringBefore(stats, "|"));
		Integer commentNum = NumberUtils.matchNumber(StringUtils.substringAfter(stats, "|"));

		// Only when neither counter was found, fall back to the labelled summary line.
		if (commentNum == null && clickNum == null) {
			stats = doc.select("p.xg1").text();
			clickNum = NumberUtils.matchNumber(StringUtils.substringBetween(stats, "查看数", "|"));
			commentNum = NumberUtils.matchNumber(StringUtils.substringBetween(stats, "评论数", "|"));
		}

		// Last resort for the date: any author-info text on the page.
		if (date == null) {
			date = DateUtils.matchDate(doc.select(".authi").text());
		}

		bbs.setClickNum(clickNum);
		bbs.setCommentNum(commentNum);
		bbs.setTitle(StringUtils.trimSpace(title));
		bbs.setContent(content);
		bbs.setDate(date);
		bbs.setAuthor(author);
		bbs.setUpdateUrl(url);
		return bbs;
	}

	/**
	 * Re-fetches the page behind the item's update URL and parses it again.
	 *
	 * @param meta the previously extracted item; only its update URL is read
	 * @return a freshly parsed {@link NewsMeta}, or {@code null} when the update
	 *         URL is blank or the fetch produced no page to parse
	 */
	public NewsMeta Update(NewsMeta meta) {
		String url = meta.getUpdateUrl();
		if (StringUtils.isBlank(url)) {
			return null;
		}
		UrlMeta urlMeta = CrawlHTML.responseToURL(url);
		// Defensive: a failed fetch (no response object or no body) is treated
		// like a blank URL instead of crashing inside the parser.
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		return parserHtml(urlMeta);
	}

}
